{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import necessary depencencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import text_normalizer as tn\n",
    "import model_evaluation_utils as meu\n",
    "\n",
    "np.set_printoptions(precision=2, linewidth=80)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and normalize data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                              review sentiment\n",
      "0  One of the other reviewers has mentioned that ...  positive\n",
      "1  A wonderful little production. <br /><br />The...  positive\n",
      "2  I thought this was a wonderful way to spend ti...  positive\n",
      "3  Basically there's a family where a little boy ...  negative\n",
      "4  Petter Mattei's \"Love in the Time of Money\" is...  positive\n"
     ]
    }
   ],
   "source": [
    "dataset = pd.read_csv(r'movie_reviews.csv')\n",
    "\n",
    "# take a peek at the data\n",
    "print(dataset.head())\n",
    "reviews = np.array(dataset['review'])\n",
    "sentiments = np.array(dataset['sentiment'])\n",
    "\n",
    "# build train and test datasets\n",
    "train_reviews = reviews[:35000]\n",
    "train_sentiments = sentiments[:35000]\n",
    "test_reviews = reviews[35000:]\n",
    "test_sentiments = sentiments[35000:]\n",
    "\n",
    "# normalize datasets\n",
    "norm_train_reviews = tn.normalize_corpus(train_reviews)\n",
    "norm_test_reviews = tn.normalize_corpus(test_reviews)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tokenize train & test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tokenized_train = [tn.tokenizer.tokenize(text) for text in norm_train_reviews]\n",
    "tokenized_test = [tn.tokenizer.tokenize(text) for text in norm_test_reviews]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Vocabulary Mapping (word to index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary Size: 82358\n",
      "Sample slice of vocabulary map: {'martyrdom': 6, 'palmira': 7, 'servility': 8, 'gardening': 9, 'melodramatically': 73505, 'renfro': 41282, 'carlin': 41283, 'overtly': 41284, 'rend': 47891, 'anticlimactic': 51}\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# build word to index vocabulary\n",
    "token_counter = Counter([token for review in tokenized_train for token in review])\n",
    "vocab_map = {item[0]: index+1 for index, item in enumerate(dict(token_counter).items())}\n",
    "max_index = np.max(list(vocab_map.values()))\n",
    "vocab_map['PAD_INDEX'] = 0\n",
    "vocab_map['NOT_FOUND_INDEX'] = max_index+1\n",
    "vocab_size = len(vocab_map)\n",
    "# view vocabulary size and part of the vocabulary map\n",
    "print('Vocabulary Size:', vocab_size)\n",
    "print('Sample slice of vocabulary map:', dict(list(vocab_map.items())[10:20]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Encode and Pad datasets & Encode prediction class labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max length of train review vectors: 1442\n",
      "Train review vectors shape: (35000, 1442)  Test review vectors shape: (15000, 1442)\n"
     ]
    }
   ],
   "source": [
    "from keras.preprocessing import sequence\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# get max length of train corpus and initialize label encoder\n",
    "le = LabelEncoder()\n",
    "num_classes=2 # positive -> 1, negative -> 0\n",
    "max_len = np.max([len(review) for review in tokenized_train])\n",
    "\n",
    "## Train reviews data corpus\n",
    "# Convert tokenized text reviews to numeric vectors\n",
    "train_X = [[vocab_map[token] for token in tokenized_review] for tokenized_review in tokenized_train]\n",
    "train_X = sequence.pad_sequences(train_X, maxlen=max_len) # pad \n",
    "## Train prediction class labels\n",
    "# Convert text sentiment labels (negative\\positive) to binary encodings (0/1)\n",
    "train_y = le.fit_transform(train_sentiments)\n",
    "\n",
    "## Test reviews data corpus\n",
    "# Convert tokenized text reviews to numeric vectors\n",
    "test_X = [[vocab_map[token] if vocab_map.get(token) else vocab_map['NOT_FOUND_INDEX'] \n",
    "           for token in tokenized_review] \n",
    "              for tokenized_review in tokenized_test]\n",
    "test_X = sequence.pad_sequences(test_X, maxlen=max_len)\n",
    "## Test prediction class labels\n",
    "# Convert text sentiment labels (negative\\positive) to binary encodings (0/1)\n",
    "test_y = le.transform(test_sentiments)\n",
    "\n",
    "# view vector shapes\n",
    "print('Max length of train review vectors:', max_len)\n",
    "print('Train review vectors shape:', train_X.shape, ' Test review vectors shape:', test_X.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the LSTM Model Architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D\n",
    "from keras.layers import LSTM\n",
    "\n",
    "EMBEDDING_DIM = 128 # dimension for dense embeddings for each token\n",
    "LSTM_DIM = 64 # total LSTM units\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_len))\n",
    "model.add(SpatialDropout1D(0.2))\n",
    "model.add(LSTM(LSTM_DIM, dropout=0.2, recurrent_dropout=0.2))\n",
    "model.add(Dense(1, activation=\"sigmoid\"))\n",
    "\n",
    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",\n",
    "              metrics=[\"accuracy\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_4 (Embedding)      (None, 1442, 128)         10541824  \n",
      "_________________________________________________________________\n",
      "spatial_dropout1d_4 (Spatial (None, 1442, 128)         0         \n",
      "_________________________________________________________________\n",
      "lstm_3 (LSTM)                (None, 64)                49408     \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 1)                 65        \n",
      "=================================================================\n",
      "Total params: 10,591,297\n",
      "Trainable params: 10,591,297\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "print(model.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize model architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg height=\"78pt\" viewBox=\"0.00 0.00 1118.00 78.00\" width=\"1118pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g class=\"graph\" id=\"graph0\" transform=\"scale(1 1) rotate(0) translate(4 74)\">\n",
       "<title>G</title>\n",
       "<polygon fill=\"white\" points=\"-4,4 -4,-74 1114,-74 1114,4 -4,4\" stroke=\"none\"/>\n",
       "<!-- 2972249086720 -->\n",
       "<g class=\"node\" id=\"node1\"><title>2972249086720</title>\n",
       "<polygon fill=\"none\" points=\"0,-0.5 0,-69.5 180,-69.5 180,-0.5 0,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"90\" y=\"-54.3\">InputLayer</text>\n",
       "<polyline fill=\"none\" points=\"0,-46.5 180,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"42.5\" y=\"-31.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"85,-23.5 85,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"132.5\" y=\"-31.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"0,-23.5 180,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"45\" y=\"-8.3\">(None, 1442)</text>\n",
       "<polyline fill=\"none\" points=\"90,-0.5 90,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"135\" y=\"-8.3\">(None, 1442)</text>\n",
       "</g>\n",
       "<!-- 2972249086552 -->\n",
       "<g class=\"node\" id=\"node2\"><title>2972249086552</title>\n",
       "<polygon fill=\"none\" points=\"216,-0.5 216,-69.5 424,-69.5 424,-0.5 216,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"320\" y=\"-54.3\">Embedding</text>\n",
       "<polyline fill=\"none\" points=\"216,-46.5 424,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"265.5\" y=\"-31.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"315,-23.5 315,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"369.5\" y=\"-31.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"216,-23.5 424,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"261\" y=\"-8.3\">(None, 1442)</text>\n",
       "<polyline fill=\"none\" points=\"306,-0.5 306,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"365\" y=\"-8.3\">(None, 1442, 128)</text>\n",
       "</g>\n",
       "<!-- 2972249086720&#45;&gt;2972249086552 -->\n",
       "<g class=\"edge\" id=\"edge1\"><title>2972249086720-&gt;2972249086552</title>\n",
       "<path d=\"M180.13,-35C188.569,-35 197.198,-35 205.821,-35\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"205.93,-38.5001 215.93,-35 205.93,-31.5001 205.93,-38.5001\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2972249086608 -->\n",
       "<g class=\"node\" id=\"node3\"><title>2972249086608</title>\n",
       "<polygon fill=\"none\" points=\"460,-0.5 460,-69.5 696,-69.5 696,-0.5 460,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"578\" y=\"-54.3\">SpatialDropout1D</text>\n",
       "<polyline fill=\"none\" points=\"460,-46.5 696,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"516.5\" y=\"-31.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"573,-23.5 573,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"634.5\" y=\"-31.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"460,-23.5 696,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"519\" y=\"-8.3\">(None, 1442, 128)</text>\n",
       "<polyline fill=\"none\" points=\"578,-0.5 578,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"637\" y=\"-8.3\">(None, 1442, 128)</text>\n",
       "</g>\n",
       "<!-- 2972249086552&#45;&gt;2972249086608 -->\n",
       "<g class=\"edge\" id=\"edge2\"><title>2972249086552-&gt;2972249086608</title>\n",
       "<path d=\"M424.018,-35C432.507,-35 441.147,-35 449.783,-35\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"449.909,-38.5001 459.909,-35 449.909,-31.5001 449.909,-38.5001\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2972249087392 -->\n",
       "<g class=\"node\" id=\"node4\"><title>2972249087392</title>\n",
       "<polygon fill=\"none\" points=\"732,-0.5 732,-69.5 927,-69.5 927,-0.5 732,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"829.5\" y=\"-54.3\">LSTM</text>\n",
       "<polyline fill=\"none\" points=\"732,-46.5 927,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"778.5\" y=\"-31.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"825,-23.5 825,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"876\" y=\"-31.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"732,-23.5 927,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"791\" y=\"-8.3\">(None, 1442, 128)</text>\n",
       "<polyline fill=\"none\" points=\"850,-0.5 850,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"888.5\" y=\"-8.3\">(None, 64)</text>\n",
       "</g>\n",
       "<!-- 2972249086608&#45;&gt;2972249087392 -->\n",
       "<g class=\"edge\" id=\"edge3\"><title>2972249086608-&gt;2972249087392</title>\n",
       "<path d=\"M696.079,-35C704.724,-35 713.419,-35 721.995,-35\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"722,-38.5001 732,-35 722,-31.5001 722,-38.5001\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2972249154672 -->\n",
       "<g class=\"node\" id=\"node5\"><title>2972249154672</title>\n",
       "<polygon fill=\"none\" points=\"963,-0.5 963,-69.5 1110,-69.5 1110,-0.5 963,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"1036.5\" y=\"-54.3\">Dense</text>\n",
       "<polyline fill=\"none\" points=\"963,-46.5 1110,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"997.5\" y=\"-31.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"1032,-23.5 1032,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"1071\" y=\"-31.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"963,-23.5 1110,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"1001.5\" y=\"-8.3\">(None, 64)</text>\n",
       "<polyline fill=\"none\" points=\"1040,-0.5 1040,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"1075\" y=\"-8.3\">(None, 1)</text>\n",
       "</g>\n",
       "<!-- 2972249087392&#45;&gt;2972249154672 -->\n",
       "<g class=\"edge\" id=\"edge4\"><title>2972249087392-&gt;2972249154672</title>\n",
       "<path d=\"M927.297,-35C935.773,-35 944.301,-35 952.648,-35\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"952.725,-38.5001 962.725,-35 952.725,-31.5001 952.725,-38.5001\" stroke=\"black\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import SVG\n",
    "from keras.utils.vis_utils import model_to_dot\n",
    "\n",
    "SVG(model_to_dot(model, show_shapes=True, show_layer_names=False, \n",
    "                 rankdir='LR').create(prog='dot', format='svg'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 31500 samples, validate on 3500 samples\n",
      "Epoch 1/5\n",
      "31500/31500 [==============================] - 2491s - loss: 0.4081 - acc: 0.8184 - val_loss: 0.3006 - val_acc: 0.8751\n",
      "Epoch 2/5\n",
      "31500/31500 [==============================] - 2489s - loss: 0.2253 - acc: 0.9158 - val_loss: 0.3209 - val_acc: 0.8780\n",
      "Epoch 3/5\n",
      "31500/31500 [==============================] - 2656s - loss: 0.1431 - acc: 0.9493 - val_loss: 0.3483 - val_acc: 0.8671\n",
      "Epoch 4/5\n",
      "31500/31500 [==============================] - 2604s - loss: 0.1023 - acc: 0.9658 - val_loss: 0.3803 - val_acc: 0.8729\n",
      "Epoch 5/5\n",
      "31500/31500 [==============================] - 2701s - loss: 0.0694 - acc: 0.9761 - val_loss: 0.4430 - val_acc: 0.8706\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x2b411229e80>"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch_size = 100\n",
    "model.fit(train_X, train_y, epochs=5, batch_size=batch_size, \n",
    "          shuffle=True, validation_split=0.1, verbose=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predict and Evaluate Model Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15000/15000 [==============================] - 352s   \n"
     ]
    }
   ],
   "source": [
    "pred_test = model.predict_classes(test_X)\n",
    "predictions = le.inverse_transform(pred_test.flatten())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.88\n",
      "Precision: 0.88\n",
      "Recall: 0.88\n",
      "F1 Score: 0.88\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.87      0.88      0.88      7510\n",
      "   negative       0.88      0.87      0.88      7490\n",
      "\n",
      "avg / total       0.88      0.88      0.88     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6633      877\n",
      "        negative        972     6518\n"
     ]
    }
   ],
   "source": [
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n",
    "                                      classes=['positive', 'negative'])  "
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
